{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f4b72fd7",
   "metadata": {},
   "source": [
    "## Description:\n",
    "这个是sharedBottom模型的demo, 尝试在中级API的基础上，加一些loss优化的思路， 这次是DWA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "324f6ca1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:\n",
      "DeepCTR version 0.9.0 detected. Your version is 0.8.2.\n",
      "Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From C:\\Users\\ZHONGQ~1\\AppData\\Local\\Temp/ipykernel_80708/534147527.py:26: The name tf.keras.backend.set_session is deprecated. Please use tf.compat.v1.keras.backend.set_session instead.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From C:\\Users\\ZHONGQ~1\\AppData\\Local\\Temp/ipykernel_80708/534147527.py:26: The name tf.keras.backend.set_session is deprecated. Please use tf.compat.v1.keras.backend.set_session instead.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import math\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import random\n",
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error\n",
    "from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat\n",
    "from deepctr.feature_column import get_feature_names\n",
    "from SharedBottom import SharedBottom\n",
    "\n",
    "import tensorflow as tf\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "config = tf.compat.v1.ConfigProto()\n",
    "config.gpu_options.allow_growth = True      # TensorFlow按需分配显存\n",
    "config.gpu_options.per_process_gpu_memory_fraction = 0.5  # 指定显存分配比例\n",
    "tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))\n",
    "\n",
    "# tf.config.experimental_run_functions_eagerly(True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "577efc09",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = '../data_process'\n",
    "data = pd.read_csv(os.path.join(data_path, 'train_data.csv'), index_col=0, parse_dates=['expo_time'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "88a84ee4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 选择出需要用到的列\n",
    "use_cols = ['user_id', 'article_id', 'expo_time', 'net_status', 'exop_position', 'duration', 'device', 'city', 'age', 'gender', 'img_num', 'cat_1', 'click']\n",
    "data_new = data[use_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ce2ea472",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 由于这个data_new的数据量还是太大， 我电脑训练不动， 所以这里再进行一波抽样\n",
    "users = set(data_new['user_id'])\n",
    "sampled_users = random.sample(users, 1000)\n",
    "data_new = data_new[data_new['user_id'].isin(sampled_users)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df0b628c",
   "metadata": {},
   "source": [
    "## 数据预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e380f114",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 处理img_num\n",
    "def transform(x):\n",
    "    if x == '上海':\n",
    "        return 0\n",
    "    elif isinstance(x, float):\n",
    "        return float(x)\n",
    "    else:\n",
    "        return float(eval(x))\n",
    "data_new['img_num'] = data_new['img_num'].apply(lambda x: transform(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "46f77eb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "user_id_raw = data_new[['user_id']].drop_duplicates('user_id')\n",
    "doc_id_raw = data_new[['article_id']].drop_duplicates('article_id')\n",
    "\n",
    "# 简单数据预处理\n",
    "sparse_features = [\n",
    "    'user_id', 'article_id', 'net_status', 'exop_position', 'device', 'city', 'age', 'gender', 'cat_1'\n",
    "]\n",
    "dense_features = [\n",
    "    'img_num'\n",
    "]\n",
    "\n",
    "# 填充缺失值\n",
    "data_new[sparse_features] = data_new[sparse_features].fillna('-1')\n",
    "data_new[dense_features] = data_new[dense_features].fillna(0)\n",
    "\n",
    "# 归一化\n",
    "mms = MinMaxScaler(feature_range=(0, 1))\n",
    "data_new[dense_features] = mms.fit_transform(data_new[dense_features])\n",
    "\n",
    "feature_max_idx = {}\n",
    "for feat in sparse_features:\n",
    "    lbe = LabelEncoder()\n",
    "    data_new[feat] = lbe.fit_transform(data_new[feat])\n",
    "    feature_max_idx[feat] = data_new[feat].max() + 1000\n",
    "\n",
    "# 构建用户id词典和doc的id词典，方便从用户idx找到原始的id\n",
    "# user_id_enc = data[['user_id']].drop_duplicates('user_id')\n",
    "# doc_id_enc = data[['article_id']].drop_duplicates('article_id')\n",
    "# user_idx_2_rawid = dict(zip(user_id_enc['user_id'], user_id_raw['user_id']))\n",
    "# doc_idx_2_rawid = dict(zip(doc_id_enc['article_id'], doc_id_raw['article_id']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ad2691ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 划分数据集  这里按照曝光时间划分\n",
    "train_data = data_new[data_new['expo_time'] < '2021-07-03']\n",
    "test_data = data_new[data_new['expo_time'] >= '2021-07-06']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ad199918",
   "metadata": {},
   "source": [
    "## 特征封装"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d0e5ea3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "sparse_feature_columns = [SparseFeat(feat, feature_max_idx[feat], embedding_dim=4) for feat in sparse_features]\n",
    "Dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c9f538ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 划分dnn和linear特征\n",
    "dnn_features_columns = sparse_feature_columns + Dense_feature_columns\n",
    "lhuc_feature_columns = sparse_feature_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f5355501",
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = get_feature_names(dnn_features_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c6a5f286",
   "metadata": {},
   "outputs": [],
   "source": [
    "# AttributeError: 'numpy.dtype[int64]' object has no attribute 'base_dtype' \n",
    "# Keras需要把输入声明为Keras张量，其他的比如numpy张量作为输入不好使\n",
    "train_model_input = {name: tf.keras.backend.constant(train_data[name]) for name in feature_names}\n",
    "test_model_input = {name: tf.keras.backend.constant(test_data[name]) for name in feature_names}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6e4803b7",
   "metadata": {},
   "source": [
    "## 模型搭建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "9b0270fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SharedBottom(dnn_features_columns, lhuc_feature_columns, tower_dnn_hidden_units=[], task_types=['regression', 'binary'], \n",
    "             task_names=['duration', 'click'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "df05fadd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "25236735",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 绘制模型结构  \n",
    "# GraphViz's executables not found  \n",
    "# 安装软件，配置环境变量即可  https://graphviz.org/download/\n",
    "# from tensorflow import keras\n",
    "# keras.utils.plot_model(model, to_file='./shared_bottom.png', show_shapes=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eb7b39f2",
   "metadata": {},
   "source": [
    "## 模型的训练和预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "4f373ebd",
   "metadata": {},
   "outputs": [],
   "source": [
    "label_duration = tf.keras.backend.constant(train_data['duration'].values)\n",
    "label_click = tf.keras.backend.constant(train_data['click'].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "01764354",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 构建数据管道\n",
    "batch_size = 128\n",
    "train_ds = tf.data.Dataset.from_tensor_slices((train_model_input, (label_duration, label_click))).shuffle(buffer_size=100).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "e9d77f25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 模型训练这里，需要用到底层的训练脚本，这里不能用高层keras的API\n",
    "optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)\n",
    "\n",
    "train_loss = tf.keras.metrics.Mean(name='train_loss')\n",
    "train_reg_loss = tf.keras.metrics.Mean(name='train_reg_loss')\n",
    "train_bin_loss = tf.keras.metrics.Mean(name='train_bin_loss')\n",
    "loss_func = {\"binary\": tf.keras.losses.binary_crossentropy, \"regression\": tf.keras.losses.mean_squared_error}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "ae2f0ef9",
   "metadata": {},
   "outputs": [],
   "source": [
    "@tf.function\n",
    "def train_step(features, labels, task_types, weight=[0.02, 0.98]):\n",
    "    losses = []\n",
    "    \n",
    "    with tf.GradientTape() as tape:\n",
    "        # 遍历每个任务\n",
    "        for i, task_type in enumerate(task_types):\n",
    "            out = model(features, training=True)\n",
    "            task_loss = loss_func[task_types[i]](out[i], labels[i])\n",
    "\n",
    "            losses.append(weight[i] * task_loss)\n",
    "            \n",
    "        loss = tf.add_n(losses)\n",
    "        gradients = tape.gradient(loss, model.trainable_variables)\n",
    "    \n",
    "    optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
    "    train_loss.update_state(loss)\n",
    "    train_reg_loss.update_state(losses[0])\n",
    "    train_bin_loss.update_state(losses[1])\n",
    "    return loss, losses[0], losses[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "ca4d17f6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 10%|████████▎                                                                          | 1/10 [00:22<03:22, 22.54s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 0, Loss: 6638.009765625 - regression_loss: 6635.50830078125 - binary_loss:2.4948813915252686, loss_weight: 1.0-1.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 20%|████████████████▌                                                                  | 2/10 [00:33<02:06, 15.79s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1, Loss: 6536.70849609375 - regression_loss: 6534.333984375 - binary_loss:2.371436595916748, loss_weight: 1.0-1.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 30%|████████████████████████▉                                                          | 3/10 [00:44<01:34, 13.47s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 2, Loss: 6560.888671875 - regression_loss: 6558.53466796875 - binary_loss:2.3511602878570557, loss_weight: 1.0085577964782715-0.9914422631263733\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 40%|█████████████████████████████████▏                                                 | 4/10 [00:55<01:14, 12.45s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 3, Loss: 6507.11328125 - regression_loss: 6504.74609375 - binary_loss:2.3642385005950928, loss_weight: 1.003063440322876-0.996936559677124\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 50%|█████████████████████████████████████████▌                                         | 5/10 [01:06<00:59, 11.94s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 4, Loss: 6453.0849609375 - regression_loss: 6450.70703125 - binary_loss:2.379669189453125, loss_weight: 0.9965590834617615-1.0034409761428833\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 60%|█████████████████████████████████████████████████▊                                 | 6/10 [01:17<00:46, 11.56s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 5, Loss: 6442.40771484375 - regression_loss: 6440.0302734375 - binary_loss:2.38036847114563, loss_weight: 0.99629145860672-1.0037086009979248\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 70%|██████████████████████████████████████████████████████████                         | 7/10 [01:27<00:33, 11.33s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 6, Loss: 6460.50537109375 - regression_loss: 6458.13720703125 - binary_loss:2.3726625442504883, loss_weight: 0.999512791633606-1.000487208366394\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [01:38<00:22, 11.19s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 7, Loss: 6465.93701171875 - regression_loss: 6463.568359375 - binary_loss:2.368056297302246, loss_weight: 1.0015122890472412-0.9984877705574036\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      " 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [01:49<00:11, 11.07s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 8, Loss: 6456.97314453125 - regression_loss: 6454.60400390625 - binary_loss:2.3698625564575195, loss_weight: 1.0006955862045288-0.9993044137954712\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [02:00<00:00, 12.03s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 9, Loss: 6449.74609375 - regression_loss: 6447.37109375 - binary_loss:2.372725009918213, loss_weight: 0.9994626045227051-1.000537395477295\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "epochs = 10\n",
    "K = 2\n",
    "T = 2\n",
    "batch_nums = math.ceil(train_data.shape[0] / batch_size)\n",
    "\n",
    "task_types = [\"regression\", \"binary\"]\n",
    "\n",
    "# 这里的task_weight 就不用tf.Variables了，因为不用梯度更新\n",
    "task_weight = np.zeros([2, epochs], dtype=np.float32)\n",
    "avg_cost = np.zeros([epochs, 2], dtype=np.float32)  # reg_loss, bin_loss\n",
    "\n",
    "dynamic_weight_average = True\n",
    "\n",
    "for epoch in tqdm(range(epochs)):\n",
    "    \n",
    "    # 累加损失\n",
    "    # acc_cost = np.zeros(2, dtype=np.float32)\n",
    "    \n",
    "    # 如果使用动态加权平均，注意依然是epoch层面, 更新权重\n",
    "    if dynamic_weight_average:\n",
    "        # 初始化\n",
    "        if epoch == 0 or epoch == 1:\n",
    "            w_1 = 1.0\n",
    "            w_2 = 1.0\n",
    "            task_weight[0, epoch] = K*np.exp(w_1/T) / (np.exp(w_1/T) + np.exp(w_2/T))\n",
    "            task_weight[1, epoch] = K*np.exp(w_2/T) / (np.exp(w_1/T) + np.exp(w_2/T))\n",
    "        else: \n",
    "            # 获取每个任务的loss下降比率\n",
    "            w_1 = avg_cost[epoch-1, 0] / avg_cost[epoch-2, 0]\n",
    "            w_2 = avg_cost[epoch-1, 1] / avg_cost[epoch-2, 1]\n",
    "            # 修改权重\n",
    "            task_weight[0, epoch] = K*np.exp(w_1/T) / (np.exp(w_1/T) + np.exp(w_2/T))\n",
    "            task_weight[1, epoch] = K*np.exp(w_2/T) / (np.exp(w_1/T) + np.exp(w_2/T))\n",
    "    else:\n",
    "        task_weight[0, epoch], task_weight[1, epoch] = 1.0, 1.0\n",
    "    \n",
    "    train_loss.reset_states()\n",
    "    train_reg_loss.reset_states()\n",
    "    train_bin_loss.reset_states()\n",
    "\n",
    "    for feature, labels in train_ds:\n",
    "        loss, loss_reg, loss_bin = train_step(feature, labels, task_types, task_weight[:,  epoch])\n",
    "        \n",
    "#         acc_cost[0] += sum(loss_reg) / batch_size\n",
    "#         acc_cost[1] += sum(loss_bin) / batch_size\n",
    "        \n",
    "    # 更新avg_cost train_reg_loss.result算的就是平均损失， 每个batch的平均损失之和/batch_num\n",
    "    avg_cost[epoch, 0] = train_reg_loss.result()\n",
    "    avg_cost[epoch, 1] = train_bin_loss.result()\n",
    "    # print(avg_cost[epoch], acc_cost/batch_nums)   这两个等价， 但差距就是有的不够batch_size个的，后面这个会按照batch_size个计算，所以不如前面这个精准\n",
    "    \n",
    "    template = 'Epoch {}, Loss: {} - regression_loss: {} - binary_loss:{}, loss_weight: {}-{}'\n",
    "    print(template.format(epoch, train_loss.result(), \n",
    "                          train_reg_loss.result(),\n",
    "                          train_bin_loss.result(), task_weight[0, epoch], task_weight[1, epoch]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "8d671e57",
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_ans = model.predict(test_model_input, batch_size=256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "32dc4aa3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test click AUC 0.491\n"
     ]
    }
   ],
   "source": [
    "print(\"test click AUC\", round(roc_auc_score(test_data['click'], pred_ans[1]), 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "6c377e25",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test duration 39.7805\n"
     ]
    }
   ],
   "source": [
    "print(\"test duration\", round(mean_absolute_error(test_data['duration'], pred_ans[0]), 4))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
